
/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is HadoopRunsMerger.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Richard McCreadie <richardm{a.}dcs.gla.ac.uk> (original author)
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*  
*/
package org.terrier.structures.indexing.singlepass.hadoop;

import java.io.IOException;
import java.util.LinkedList;
import java.util.ListIterator;

import org.terrier.structures.BasicLexiconEntry;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.indexing.singlepass.PostingInRun;
import org.terrier.structures.indexing.singlepass.RunIterator;
import org.terrier.structures.indexing.singlepass.RunIteratorFactory;
import org.terrier.structures.indexing.singlepass.RunsMerger;

/**
 * The main merger class for Hadoop runs. It provides the merging of the
 * lexicons and inverted index shards produced by the map-task indexers.
 * @since 2.2
 * @author Richard McCreadie and Craig Macdonald
 */
public class HadoopRunsMerger extends RunsMerger {

  /** The data loaded from side-effect files about each map task */
  protected LinkedList<MapData> mapData = null;
 
  /** Number of reducers used */
  protected int numReducers = 0;
  /**
   * Constructs an instance of HadoopRunsMerger.
   * @param _runsSource factory from which the runs to be merged are obtained
   */
  public HadoopRunsMerger(RunIteratorFactory _runsSource) {
    super(_runsSource);
  }

  /**
   * Begins an alternate merge operation over a linked list of per-map
   * {@link MapData} records. This merge combines the multiple runs created
   * during the map phase of Hadoop indexing, correcting for the document-id
   * 'shift' caused by the splitting of runs across map tasks and flushes.
   * @param _mapData information about the number of documents per map and run; one element for every map.
   */
  public void beginMerge(LinkedList<MapData> _mapData)
  {
    mapData = _mapData;
  }
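  /*
   * A minimal driving sketch (hypothetical names; control flow simplified).
   * Assuming a HadoopRunsMerger "merger", the per-map side-effect data
   * "maps", and a LexiconOutputStream<String> "lexOut" are in hand:
   *
   *   merger.beginMerge(maps);   // record per-map/per-flush document counts
   *   while (termsRemain())      // one mergeOne() call per distinct term
   *     merger.mergeOne(lexOut);
   *   merger.endMerge(lexOut);   // a no-op in this subclass
   */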
  /**
   * {@inheritDoc} This implementation is a no-op.
   */
  public void endMerge(LexiconOutputStream<String> lexStream) {}
  /**
   * {@inheritDoc}
   */
  public void mergeOne(LexiconOutputStream<String> lexStream) throws Exception
  {
    int maxDF = 0;
    RunIterator run = runsSource.createRunIterator(-1);
    HadoopRunPostingIterator _run = (HadoopRunPostingIterator)run;
   
    lastTermWritten = null;
    lastFreq = 0;
    lastDocFreq = 0;
    lastDocument = -1;
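    // remember where this term's posting list starts in the bit-compressed
    // inverted file; these offsets are stored in the lexicon entry below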
    long startOffset = this.getByteOffset();
    byte startBitOffset = this.getBitOffset();
    LexiconEntry le = null;
    // for the current term: append each run's postings for that term in turn
    while (run.hasNext()) {
      PostingInRun posting = run.next();
      lastTermWritten = posting.getTerm();

      if (posting.getDf() > maxDF)
        maxDF = posting.getDf();

      // shift this run's local document ids by the number of documents
      // that precede its map split and flush
      final int docOffset = getDocumentOffset(_run.getSplitNo(), _run.getRunNo());
      lastDocument = posting.append(bos, lastDocument, docOffset);
      if (le == null)
        le = posting.getLexiconEntry();
      else
        posting.addToLexiconEntry(le);
      lastFreq += posting.getTF();
      lastDocFreq += posting.getDf();
    }
    le.setTermId(currentTerm++);
    ((BasicLexiconEntry)le).setOffset(startOffset, startBitOffset);
    lexStream.writeNextEntry(lastTermWritten, le);
    numberOfPointers += lastDocFreq;
  }
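  /*
   * Illustration of the document-id correction above (hypothetical numbers):
   * if a run from map split 2 holds postings with local document ids 0 and 3,
   * and getDocumentOffset(2, 0) reports that 1000 documents precede that
   * split, then posting.append(...) emits them with global ids 1000 and 1003.
   */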
 
  /** Gets the number of reducers being merged for:
   * 1 for a single reducer, &gt;1 for multiple reducers.
   * @return how many reducers are in use.
   */
  public int getNumReducers() {
    return numReducers;
  }

  /** Sets the number of reducers being merged for:
   * 1 for a single reducer, &gt;1 for multiple reducers.
   * @param _numReducers how many reducers are in use.
   */
  public void setNumReducers(int _numReducers) {
    this.numReducers = _numReducers;
  }
  /**
   * Computes the document-id offset for postings from a given map split and
   * flush: the number of documents indexed before that split and flush.
   * @param splitNo the number of the map split
   * @param flushNumber the number of the flush within that split
   * @return the document-id offset to apply to postings from that split and flush
   * @throws IOException if no map data is found for the specified split
   */
  public int getDocumentOffset(int splitNo, int flushNumber) throws IOException {
    int numPreDocs = 0;
    MapData correctHRD = null;
    // find the map data for this split, summing the document
    // counts of the maps that precede it
    for (MapData tempHRD : mapData)
    {
      if (splitNo == tempHRD.getSplitnum()) {
        correctHRD = tempHRD;
        break;
      }
      numPreDocs += tempHRD.getMapDocs();
    }
    if (correctHRD == null)
      throw new IOException("Did not find map data for split " + splitNo);

    // add the flush shift: the documents emitted by the earlier
    // flushes of this map
    int currentFlushDocs = 0;
    ListIterator<Integer> li = correctHRD.getFlushDocSizes().listIterator(0);
    int currentFlush = 0;
    while (currentFlush < flushNumber) {
      currentFlushDocs += li.next();
      currentFlush++;
    }

    return numPreDocs + currentFlushDocs;
  }
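  /*
   * Worked example (hypothetical numbers): with three maps holding 500, 400
   * and 300 documents for splits 0, 1 and 2, and split 2 flushing twice with
   * flush sizes [120, 180], getDocumentOffset(2, 1) = (500 + 400) + 120 =
   * 1020: the documents of the two earlier splits plus those of split 2's
   * first flush.
   */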

}